home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Aminet 1 (Walnut Creek)
/
Aminet - June 1993 [Walnut Creek].iso
/
aminet
/
util
/
pack
/
xpkrdcn21.lha
/
xpkRDCN
/
compr.asm
next >
Wrap
Assembly Source File
|
1993-04-10
|
11KB
|
314 lines
;---------------------------------------------------------------------------;
; ;
; This is the Ross Data Compression implemented in 68000 assembler ;
; by Niklas Sjöberg, 2:203/415.3@Fidonet ;
; ;
; ;
; The source is fully in the Public Domain. If you think you can improve ;
; some parts, feel free to modify the code. HOWEVER, be carefull to comment ;
; the parts where you change! ;
; ;
; On entry (if you compre to the C-code): ;
; ;
; a0 = ctrl_idx, pointer to 'outbuff' where the 16 bits control-word is ;
; stored after each 16th sequence. On startup, ctrl_idx is set to ;
; the start of 'outbuff' and 'outbuff' is advanced two bytes. ;
; a1 = in_idx, pointer to next byte of input ;
; a2 = out_idx, pointer to 'outbuff' ;
; a3 = inbuff_end, end of input buffer (could you guess? :) ;
; a4 = anchor, temporary pointer to in_idx, undefined on entry! ;
; a5 = pat_idx, temporary pointer to pattern in dictionary, undefined on ;
; entry! ;
; a6 = hash_tbl (uchar **hash_tbl) pointer. Size is 16K, just enough for ;
; 4096 pointers. ;
; d0 = ctrl_cnt, used to keep track of when control-word is to be written ;
; to ctrl_idx. (every 16th sequence). Start at 1(!), counts to 16 ;
; 0 one startup, but note that d0 is increased by one _before_ it ;
; actually is used.
; d1 = c. Stores various chars when comparing source and destination. ;
; always byte-size. Zero on startup. ;
; d2 = cnt. Stores different values. Used as a 'quick counter' instead of ;
; subtracting to addresses. Zero on startup. ;
; d3 = ctrl_bits, 16 bit compression code. Zero on startup. ;
; d4 = gap. Used as counter to see if pattern occured with 4098 range. ;
; Zero on startup. ;
; d5 = hash. 16 bit hash-value used as dictionary lookup. Always in range ;
; from 0 to 4095. Zero on startup. ;
; d6 = outbuff_end, end of 'outbuff'. If we exceed this limit (actually we ;
; have a 48 byte margin) data couldn't be compressed! ;
; ;
; ;
;---------------------------------------------------------------------------;
; Since both a4 and a5 just are used temprarily they're used as temporary ;
; scratch in some places. Similary d5 (hash) and d7 are used as scratch ;
;---------------------------------------------------------------------------;
;
;
; Version changes:
; 921205 - ctrl_cnt (d0) now count from 16 and downwards. Slightly faster.
; Now uses bset instead of asl and or.
; 921205 - Due to optimizing all comments may not be totally accurate. Ie.
; you have to read 5 lines at a time in order to understand the
; comments..
; 930325 - Some optimizations by John Harris - jharris@cup.portal.com
; All new opts marked with a '@'. Cycle savings at 68000 level:
; - Changed branches in ploop, and replaced "tst, bne, sub" with "dbf"
; Saved 14.25 cycles per loop.
; - RLE check 20-74 cycles faster when RLE fails, or 52 + 38 per loop
; when RLE is successful
; - Pattern check is an additional 12 faster, plus 48 per loop
; 930409 - (Niklas Sjoberg) Fixed some lethal bugs. Might take a few more
; cycles but compression won't be safe otherwise!
xdef _pack_one ; only one routine, called from the
; C-part of the library
INCLUDE "libraries/xpksub.i"
section code
_pack_one
movem.l a2-a6/d2-d7,-(SP) ; Save non-scratch registers
moveq #16,d0
moveq #$0,d1
moveq #$0,d2
moveq #$0,d3
moveq #$0,d4
moveq #$0,d5
moveq #$0,d7
suba.l a4,a4 ; not necessary, but may help debugging.
suba.l a5,a5
bra.s pstart ; @ Make spot to place exit sections so
; ; some branches can be made short, or
; ; made to fall through most of the time
; ; @ Moved overflow and ploop_end here
overflow
moveq #0,d0 ; If overflow the C-code will return
bra.s p_exit ; XPKERR_EXPANSION
; ; This and ploop_end are the only exit points
ploop_end
; We ran out of buffer, phew!
; move.w #16,d7 ; make sure the last control bits
; sub.w d0,d7 ; are written to ctrl_idx first-
; asl.w d7,d3
; move.w d3,(a0) ; *ctrl_idx = ctrl_bits
move.b d3,1(a0) ;store controlword
asr.w #8,d3 ; Now works with 68000 as well..
move.b d3,(a0)
move.l a2,d0 ; return out_idx, on error (overflow)
p_exit movem.l (SP)+,a2-a6/d2-d7 ; we return 0
rts
pstart
; move.l d0,-(SP)
; move.l a3,d0
; sub.l a1,d0
; cmp.l #8170,d0
; bhi cont
; nop
cont
; move.l (SP)+,d0
cmpa.l a1,a3 ; Main-loop, running as long as in_idx
bls.s ploop_end ; doesn't exceed outbuff_end.
; tst.b d0 ; @Is it time to write controlbits into
; bne.s no_makeroom ; @ctrl_idx?
dbf d0,no_makeroom ; @ Replaced above two lines, and
; ; subq #1 later, with this DBF
; move.w d3,(a0) ; store control-word and
move.b d3,1(a0) ;Now works on 68000
asr.w #8,d3
move.b d3,(a0)
moveq #0,d3
moveq #15,d0 ; advance out_idx two bytes.
move.l a2,a0 ; but first store address where
addq.w #2,a2 ; the next word is to be written
cmp.l d6,a2 ; Make sure there is no overflow.
bhi.s overflow ; @ Chg branch priority out_idx really should be < outbuff_end :)
no_makeroom
; subq.b #1,d0 ; @ Deleted
; ; @ Deleted label "no_overflow"
move.l a1,a4 ; Use anchor = in_idx
move.b (a1)+,d1 ; First byte to compare. in_idx is advanced
cmp.b (a1)+,d1 ; @ Since most of the time will not be
bne.s n_rleseq ; rle, I put two compares outside
cmp.b (a1)+,d1 ; the loop, for much less overhead
bne.s n_rleseq ; and elimination of cmpi.w #2 later
cmp.l a1,a3
bcs.s cant_compress ; @ less than 3 bytes left in inbuff
; ; @ Deleted old rep1 loop
; moveq #1,d2
;rep1
; cmp.b (a1),d1 ; Does next char match the one before
; bne.s nrep
; cmp.w #4114,d2 ; Make sure we don't exceed the longest
; bge.s nrep ; possible RLE
; cmpa.l a1,a3 ; Make sure we're not running out of source
; bls.s nrep
; addq.w #1,a1 ; Bytes obviously was equal, advance in_idx
; addq.w #1,d2 ; to next byte.
; bra.s rep1
;nrep cmpi.w #$2,d2 ; @ Deleted - cnt<3 handled outside loop
; bls.s n_rleseq ; @
; ; @ rep1 now counts down from 4111. Adjusted
; ; after loop to # of cnts past 3. This is
; ; over twice as fast as the old loop.
move.w #4111,d2
move.w d2,d7
rep1
cmpa.l a1,a3
bls.s nrep ; @ If exited on cmp with inbuff_end, adrs & cnt
cmp.b (a1)+,d1
bne.s adj_a1
dbf d7,rep1
; ; are correct.
moveq #0,d7 ; @ Otherwise, both need to be adjusted by 1
adj_a1 subq.w #1,a1 ; @ a1 is one higher than it should be
; ; because of post index
nrep sub.w d7,d2 ; @ Adjust d2 = # of cnts past 3
cmpi.w #15,d2 ; @ adjusted for new d2
bgt.s l_rlen ; if cnt <= 18 we use short form of RLE
; subq.b #3,d2 ; @ Deleted. count, at least three chars
move.b d2,(a2)+
move.b d1,(a2)+ ; the character itself
bset d0,d3
; asl.w #1,d3 ; Shift and set control bits
; or.w #1,d3
bra.s pstart ; re-start loop, until in_idx => outbuff_end
l_rlen ; If cnt > 18 we encode a long RLE (max 4114 chars)
sub.w #16,d2 ; @ Adjusted. cnt -= 19, at least 19 repeated chrs
move.b d2,d7 ; out_idx++ = 16 + (cnt & 0x0F) Store low count
and.b #15,d7 ; and code
add.b #16,d7
move.b d7,(a2)+
asr.w #4,d2 ; out_idx++ = cnt >> 4
move.b d2,(a2)+ ; and store high count
move.b d1,(a2)+ ; the character itself
; asl.w #1,d3 ; Just like above, set and shift
; or.w #1,d3
bset d0,d3
bra.w pstart
; ; @ Moved cant_compress here to make the best
; ; use of short branches
cant_compress
move.b d1,(a2)+ ; Data couldn't be compressed so
; addq.l #1,a4 : just copy to out_idx, and set
; move.l a4,a1 ; in_idx to next byte of data
lea 1(a4),a1
; asl.w #1,d3 ; Do NOT set control bit, just shift
bra.w pstart
n_rleseq
; ; @ ALERT This check was changed to .w, because
; ; the input buffer size is <64K. This is
; ; faster, but needs to be changed back to .l
; ; if inbuff will ever be increased above 64K
move.w a3,d5 ; inbuff_end - in_idx > 2
sub.w a4,d5 ; we must have at least 3 characters left in
subq.w #$2,d5 ; @ chg cmpi to subq. order to calc the hash.
bls.s cant_compress ; if less bytes left, just copy them
move.l a4,a1 ; in_idx = anchor, restore to original start
moveq #0,d5 ; hash
moveq #0,d7 ; scratch
; Now lets calculate the hash, in C this is done as:
; unsigned short int hash = ((((in_idx[0] & 15) << 8) | in_idx[1]) ^
; ((in_idx[0] >> 4) | (in_idx[2] << 4))) & 4096-5
;
; cnt (=d2) is used as scratch here..
move.b (a1)+,d5 ; @ chged to '(a1)+' in_idx[0] & 15 << 8
move.b d5,d7 ; in_idx[0]
and.b #15,d5
asl.w #8,d5
or.b (a1)+,d5 ; @ '+' | in_idx[1]
asr.b #4,d7 ; ^ in_idx[0] >> 4
moveq #0,d2 ; gee, almost ran out of registers :)
move.b (a1)+,d2 ; @ '+' in_idx[2] << 4
asl.w #4,d2
or.w d2,d7
eor.w d7,d5
and.w #4096-1,d5
asl.w #2,d5 ; char * is 4 bytes, adjust offset
move.l a4,a1 ; @ Restore a1
; The zero here is used in order to stop SAS asm from stupid complaining!
move.l 0(a6,d5.w),a5 ; pat_idx now zero or pointer to pattern
move.l a1,0(a6,d5.w) ; store in_idx at this offset
move.l a1,d4 ; gap = in_idx - pat_idx, ie. if
move.l a5,d7 ; pat_idx was zero, gap will be huge
sub.l d7,d4 ; which showed that we didn't find
cmp.l #4098,d4 ; a pattern. That's why .l is used here!
bgt.s cant_compress
move.w #271,d2 : @ chged loop to count from 271 down
move.w d2,d7 ; @
rep2
cmpa.l a5,a4 ; and pat_idx doesn't advance into the source!
bls.s nrep2 ; this is VERY important!!
cmpa.l a1,a3 ; While we don't run out of source..
bls.s nrep2 ; @ If exited on cmp with inbuff_end, adrs & cnt
cmpm.b (a5)+,(a1)+ ; @ Changed to cmpm.b
bne.s adj2_a1
dbf d7,rep2 ; @ chged to dbls
; ; are correct.
moveq #0,d7 ; @ Otherwise, both need to be adjusted by 1
adj2_a1 subq.w #1,a1 ; @ a1 is one higher than it should be
nrep2 sub.w d7,d2
cmpi.w #2,d2 ; If cnt <= 2 we can't compress!
bls.s cant_compress ; @ Changed to .s
subq.w #3,d4 ; gap -= 3, at least length of three
cmpi.w #15,d2 ; if cnt <= 15 we encode as short pattern
bgt.s is_lpat
asl.b #4,d2 ; *out_idx++ = (cnt << 4) + (gap & 0x0F)
move.b d4,d7 ; store count (low) and low offset
and.b #15,d7
add.b d7,d2
move.b d2,(a2)+
asr.w #4,d4 ; *out_idx++ = gap >> 4
move.b d4,(a2)+ ; store high offset in hash-table
; asl.w #1,d3 ; set and shift control-bits
; or.w #1,d3
bset d0,d3
bra.w pstart ; back to main-loop
; cnt > 15 so encode as long pattern
is_lpat
move.b d4,d7 ; *out_idx++ = 32 + (gap & 0x0F)
and.b #15,d7 ; store 2 (low) and low offset
add.b #32,d7
move.b d7,(a2)+
asr.w #4,d4 ; *out_idx++ = gap >> 4
move.b d4,(a2)+ ; store high offset
sub.w #16,d2 ; *out_idx++ = cnt - 16;
move.b d2,(a2)+ ; store count (since length is at least
; 16 we can subtract 16 and thus handle
; patterns <= 271
bset d0,d3
bra.w pstart
END